import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import seaborn as sns
from IPython.display import HTML
Let's load data from a CSV file and print out a few lines.
# Read the movie dataset from the working directory into a DataFrame.
df = pd.read_csv('tmdb-movies.csv')
# Preview the first two rows, then report the (rows, columns) shape.
df.head(2)
df.shape
We'll be analyzing a dataframe with 21 columns and 10866 entries. We can notice that columns “cast”, “genres” and “production_companies” contain multiple values separated by '|' character.
# Print each column's dtype and non-null count to spot missing values.
df.info()
There are several columns with missing values. For example, "homepage" has only 2936 values and "tagline" has 8042 values. Luckily, these columns are not needed for further analysis and can be dropped. So, the first thing we're going to do is drop all unnecessary columns.
# Identifier and free-text columns that the analysis below never reads.
unused_columns = [
    'id', 'imdb_id', 'homepage', 'tagline', 'keywords', 'overview',
]
df = df.drop(columns=unused_columns)
df.shape
Now we have only 15 columns left.
We can notice that "release_date" Dtype is object. It means date is in string format. It's better to convert it to special datetime format.
# Parse the string dates into datetime64 values. Bracket assignment is used
# because it is the form pandas documents for writing a column.
df['release_date'] = pd.to_datetime(df['release_date'])
# NOTE(review): if the CSV stores two-digit years (e.g. "6/9/66"), the parser
# maps 00-68 to 20xx, which would push 1960s releases a century forward --
# confirm against the raw file. The guard below only touches dates that land
# exactly 100 years after the "release_year" column, so it is a no-op when
# the parsed years already agree.
century_ahead = (df['release_date'].dt.year - df['release_year']) == 100
df.loc[century_ahead, 'release_date'] -= pd.DateOffset(years=100)
df.info()
df.describe()
"budget", "budget_adj", "revenue", "revenue_adj", "runtime" columns have zero values. Obviously, these values can't be zero and I think they are supposed to be missing. We have to change them to NaN so that they’ll not influence statistics.
# A zero in these columns is treated as "unknown": store it as NaN so it is
# left out of the summary statistics and histograms.
for zero_means_missing in ('budget', 'budget_adj', 'revenue', 'revenue_adj',
                           'runtime'):
    df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)
df.describe()
df.hist(figsize=(10, 8));
We can notice that the “budget”, “revenue”, "popularity" and "vote_count" histograms are extremely right-skewed. The maximum values of these columns are far above the rest of the data. For example, the "popularity" mean is around 0.64, the standard deviation is around 1.0 and 75% of the values are lower than 1.0, but the maximum is almost 33!
"release_year" histogram is left skewed. It means the number of movie releases increases every year.
"vote_average" is almost normally distributed as it should be.
# Re-check the non-null counts now that zeros have been replaced with NaN.
df.info()
There are still several columns with null values. But it makes no sense to drop these entries. Better tactic would be to use as much data as possible for every question.
Now that we've trimmed and cleaned our data, we're ready to move on to exploration. Compute statistics and create visualizations with the goal of addressing the research questions that we posed in the Introduction section.
To answer this question we'll create an animated horizontal bar chart.
Animated bar chart https://towardsdatascience.com/bar-chart-race-in-python-with-matplotlib-8e687a5c8a41
Splitting values https://medium.com/@lsriniv/splitting-columns-of-series-dataframe-into-rows-step-by-step-explanation-9fce88ed1144
Previously we mentioned that “genres” column contains multiple values separated by '|' character. So we have to split them in order to form the list of all genres.
# One entry per (movie, genre) pair after splitting on '|'.
genre_labels = df['genres'].str.split('|', expand=True).stack()
# Distinct genre names, ordered from most to least frequent.
genres = genre_labels.value_counts().index
print("Number of genres is {}".format(genres.size))
We have 20 genres overall. Let's create a color map for them, so that every genre would have a unique color. Choosing colors is a very complicated task, so we’ll use built-in matplotlib “tab20” colormap that has exactly 20 colors with a good-looking palette.
# Give every genre its own colour from matplotlib's "tab20" palette.
# plt.get_cmap() is used because plt.cm.get_cmap() was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap('tab20')
# Calling a colormap with an integer picks that palette entry directly, so
# no running float offset is needed. The modulo recycles colours instead of
# clamping to the last entry if there are ever more genres than colours.
colors_map = {genre: cm(i % cm.N) for i, genre in enumerate(genres)}
Let's create a function that takes a column holding multiple '|'-separated values and a numeric column, and returns the average of the numeric column for each individual value, sorted in ascending order. It will help us analyse all the multi-value columns.
def get_mdepend(df, multival_col, qual_col):
    """Average ``qual_col`` for every individual value of ``multival_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    multival_col : str
        Column whose cells hold several values separated by '|'.
    qual_col : str
        Column that is averaged for each separated value.

    Returns
    -------
    pandas.DataFrame
        Indexed by the separated values, with the single column
        ``qual_col`` holding the means, sorted ascending by that column.
    """
    # One column per '|'-separated piece, then one row per piece.
    pieces = df[multival_col].str.split('|', expand=True)
    long_values = pieces.stack()
    # Discard the piece-position level so only the original row label is left.
    long_values = long_values.reset_index(level=1, drop=True)
    long_values = long_values.rename(multival_col)
    # Attach the measured column through the shared row labels.
    paired = long_values.to_frame().join(df[qual_col])
    averages = paired.groupby(multival_col).mean()
    return averages.sort_values(qual_col)
Next we'll create a function that plots our horizontal bar chart with the popularity of movies for all genres up to desired year.
def draw_barchart_frame(current_year):
    """Redraw the genre-popularity bar chart for releases up to a given year.

    Clears the module-level ``ax`` and draws one horizontal bar per genre,
    coloured through ``colors_map``, using the mean popularity that
    ``get_mdepend`` computes from the module-level ``df``.

    Parameters
    ----------
    current_year : int
        Last release year (inclusive) included in the averages; also shown
        as the large year label on the chart.
    """
    # Restrict the data to movies released up to and including current_year.
    dep = get_mdepend(df.query('release_year <= {}'.format(current_year)),
                      'genres', 'popularity')
    values = dep['popularity'].tolist()
    # Wipe the previous frame before drawing the new one.
    ax.clear()
    ax.barh(dep.index,
            values,
            color=[colors_map[x] for x in dep.index])
    # Gap between a bar's end and its labels. Take the maximum of the column
    # itself so dx is a scalar: DataFrame.max() returns a Series, which would
    # turn every text x-coordinate below into a Series.
    dx = dep['popularity'].max() / 200
    for i, (value, name) in enumerate(zip(values, dep.index)):
        # Genre name, right-aligned just inside the bar end.
        ax.text(value - dx,
                i,
                name,
                size=14,
                weight=600,
                ha='right',
                va='center')
        # Genre value, left-aligned just outside the bar end.
        ax.text(value + dx,
                i,
                f'{value:,.2f}',
                size=14,
                ha='left',
                va='center')
    # Large year label, positioned in axes coordinates.
    ax.text(1,
            0.2,
            current_year,
            transform=ax.transAxes,
            color='#777777',
            size=46,
            ha='right',
            weight=800)
    # Caption for the tick values along the top edge.
    ax.text(0,
            1.065,
            'Popularity',
            transform=ax.transAxes,
            size=14,
            color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.1f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    # Genre names are drawn as text above, so y ticks are not needed.
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    # Chart title.
    ax.text(0,
            1.16,
            'Popularity of movie genres from 1960 to 2015',
            transform=ax.transAxes,
            size=24,
            weight=600,
            ha='left',
            va='top')
Finally we'll create an animation.
# Canvas for the frame callback, which draws on the global ``ax``.
fig, ax = plt.subplots(figsize=(10, 8))
# Hide the rectangular frame around the axes.
ax.set_frame_on(False)
# Close the figure straight away so no static copy of it is shown
# after the animation.
plt.close(fig)
animator = animation.FuncAnimation(
    fig,
    draw_barchart_frame,
    frames=range(1960, 2016),
    interval=666,
)
# Blank line for spacing before the animation.
print('')
HTML(animator.to_jshtml())
We need only the rows where both budget and revenue values are available. Also, I think it's better to use the adjusted values so that inflation doesn't interfere with our calculations.
# Keep only the rows where both adjusted money columns are present.
has_financials = df[['budget_adj', 'revenue_adj']].notnull().all(axis=1)
dfp = df[has_financials].copy()
# Profit is adjusted revenue minus adjusted budget.
dfp['profit'] = dfp['revenue_adj'] - dfp['budget_adj']
Let's try to find if there is any correlation with profit.
# Two rows of regression plots of profit against three candidate features each.
predictor_rows = (
    ['popularity', 'budget', 'runtime'],
    ['vote_count', 'vote_average', 'release_year'],
)
for predictors in predictor_rows:
    sns.pairplot(data=dfp, x_vars=predictors, y_vars=['profit'], kind='reg')
"popularity" and "vote_count" have a positive correlation over profit. Obviously, the more people watch the movie the more revenue it gets.
"budget" has a small positive correlation. So we can conclude that higher investments in movies cause higher revenues.
Surprisingly, "vote_average" has a weak positive correlation with profit.
Let's configure default parameters for our plots like figure size and font sizes.
# Shared plot defaults: figure size and title font size first.
params = {
    'figure.figsize': (10, 8),
    'axes.titlesize': 'xx-large',
}
# The legend, axis labels and tick labels all share one font size.
for text_setting in ('legend.fontsize', 'axes.labelsize',
                     'xtick.labelsize', 'ytick.labelsize'):
    params[text_setting] = 'x-large'
plt.rcParams.update(params)
Next we will implement a function to plot a profit chart.
def profit_chart(df, title):
    """Draw a horizontal bar chart with the x axis shown in millions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``DataFrame.plot(kind='barh')``.
    title : str
        Title placed above the chart.
    """
    def in_millions(x, p):
        # Tick value as a whole number of millions with thousands separators.
        return format(int(x / 1e6), ',')

    axes = df.plot(kind='barh')
    # Remove the legend that the plot call added.
    axes.get_legend().remove()
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(in_millions))
    axes.set_title(title)
    axes.set_xlabel('Average Profit ($ million)')
    axes.set_ylabel(None)
Let's find out how profit depends on genre.
# Average profit for every genre, sorted ascending by get_mdepend.
genre_profit = get_mdepend(dfp, 'genres', 'profit')
profit_chart(genre_profit, 'Movie Profits By Genre')
There are lots of production companies. Let's find out TOP-10 profitable production companies.
# get_mdepend sorts ascending, so the last ten rows hold the ten highest
# average profits.
company_profit = get_mdepend(dfp, 'production_companies', 'profit')
profit_chart(company_profit.tail(10),
             'TOP-10 Profitable Production Companies')
Let's find out TOP-10 profitable actors.
# get_mdepend sorts ascending, so the last ten rows hold the ten highest
# average profits.
cast_profit = get_mdepend(dfp, 'cast', 'profit')
profit_chart(cast_profit.tail(10), 'TOP-10 Profitable Actors')
Let's look at how profit depends on release year.
# Mean profit of the movies released in each year.
year_profit_mean = dfp.groupby('release_year')['profit'].mean()
# Smooth with a rolling mean over a window of 10 yearly values.
year_profit_rol = year_profit_mean.rolling(window=10).mean()
ax = year_profit_rol.plot(figsize=(12, 8))
# Show the y axis as whole millions with thousands separators.
ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda y, p: format(int(y / 1e6), ',')))
ax.set_title('Movie Profits By Year')
ax.set_ylabel('Average Profit ($ million)')
ax.set_xlabel("Release Year");
We can see how huge the difference is between profits in the 1980s and nowadays. It is probably connected with the fact that the number of movies released every year was increasing over the same period. Let's quickly look at a graph to check this.
# "popularity" is known to have no nulls, so counting it per release year
# gives the number of movies for that year.
movies_per_year = df.groupby('release_year')['popularity'].count()
ax = movies_per_year.plot(figsize=(12, 8))
ax.set_title('Number Of Movies Per Year')
ax.set_ylabel('Number Of Movies')
ax.set_xlabel("Release Year");
We can notice that the number of movies per year moves in the opposite direction to the average movie profit per year.
Finally, we can summarize our findings and answer questions from the Introduction section.
By looking at animated bar charts of genres' popularity we can watch how movie trends were changing over the years.
Properties of high profit movies:
Limitations: There is a lot of missing data in the dataset.